Load data from Nature 488, pp. 621-626:
STAT.RData
load('STAT.RData')
library(phyloseq)
head(sample_data(phy))
## Sample Data: [6 samples by 6 sample variables]:
## X.SampleID BarcodeSequence LinkerPrimerSequence Treatment
## cecal_C1 cecal_C1 NA NA C
## cecal_C10 cecal_C10 NA NA C
## cecal_C2 cecal_C2 NA NA C
## cecal_C3 cecal_C3 NA NA C
## cecal_C4 cecal_C4 NA NA C
## cecal_C5 cecal_C5 NA NA C
## Location Description
## cecal_C1 cecal missing_description
## cecal_C10 cecal missing_description
## cecal_C2 cecal missing_description
## cecal_C3 cecal missing_description
## cecal_C4 cecal missing_description
## cecal_C5 cecal missing_description
levels(sample_data(phy)$Treatment)
## [1] "C" "P" "T" "V" "VP"
head(phenotypes)
## GIP Insulin Leptin IGF1 BMD mFat pFat
## C1 8.61 1 2320 1250 0.0492 4.0 19.7
## C2 28.60 137 1040 2180 0.0487 3.7 18.6
## C3 19.20 543 1670 2160 0.0510 3.6 19.5
## C4 38.30 606 972 2070 0.0465 4.0 21.3
## C5 13.20 1 2210 1240 0.0547 4.1 21.6
## C6 36.80 696 2330 1830 0.0512 3.7 17.7
phy
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6547 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 6547 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6547 tips and 6321 internal nodes ]
sample_names(phy)[1:5]
## [1] "cecal_C1" "cecal_C10" "cecal_C2" "cecal_C3" "cecal_C4"
The sample names are in different formats: phenotypes are per mouse; phy data are per mouse-location.
Merge
phenotypesvariable with thephyloseqdata
pheno1 = phenotypes
pheno2 = phenotypes
rownames(pheno1) = paste('cecal', rownames(pheno1), sep="_")
rownames(pheno2) = paste('fecal', rownames(pheno2), sep="_")
pheno = rbind(pheno1, pheno2)
phy2 = merge_phyloseq(phy, sample_data(pheno))
phy
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6547 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 6 sample variables ]
## tax_table() Taxonomy Table: [ 6547 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6547 tips and 6321 internal nodes ]
phy2
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 6547 taxa and 96 samples ]
## sample_data() Sample Data: [ 96 samples by 13 sample variables ]
## tax_table() Taxonomy Table: [ 6547 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 6547 tips and 6321 internal nodes ]
head(sample_data(phy2))
## Sample Data: [6 samples by 13 sample variables]:
## X.SampleID BarcodeSequence LinkerPrimerSequence Treatment
## cecal_C1 cecal_C1 NA NA C
## cecal_C10 cecal_C10 NA NA C
## cecal_C2 cecal_C2 NA NA C
## cecal_C3 cecal_C3 NA NA C
## cecal_C4 cecal_C4 NA NA C
## cecal_C5 cecal_C5 NA NA C
## Location Description GIP Insulin Leptin IGF1 BMD
## cecal_C1 cecal missing_description 8.61 1 2320 1250 0.0492
## cecal_C10 cecal missing_description 27.70 516 2680 3720 0.0479
## cecal_C2 cecal missing_description 28.60 137 1040 2180 0.0487
## cecal_C3 cecal missing_description 19.20 543 1670 2160 0.0510
## cecal_C4 cecal missing_description 38.30 606 972 2070 0.0465
## cecal_C5 cecal missing_description 13.20 1 2210 1240 0.0547
## mFat pFat
## cecal_C1 4.0 19.7
## cecal_C10 4.0 19.8
## cecal_C2 3.7 18.6
## cecal_C3 3.6 19.5
## cecal_C4 4.0 21.3
## cecal_C5 4.1 21.6
Estimate observed species with Chao1 and Shannon diversity using
estimate_richness()function
?estimate_richness
alpha = estimate_richness(phy2, measures=c("Observed", "Chao1", "Shannon"))
head(alpha)
## Observed Chao1 se.chao1 Shannon
## cecal_C1 662 1650.429 148.60209 4.836219
## cecal_C10 592 1508.453 153.81056 4.814829
## cecal_C2 617 1216.394 91.48394 4.775428
## cecal_C3 731 1746.219 144.87205 4.918447
## cecal_C4 601 1452.068 137.36109 4.636482
## cecal_C5 698 1434.391 104.96195 4.944320
plot(ecdf(sample_sums(phy2)))
Estimate rarified diversities in #3 averaged over 20 replicates and compare the two estimates
phy3 = subset_samples(phy2, sample_sums(phy2)>3500)
alpha = estimate_richness(phy3, measures=c("Observed", "Chao1", "Shannon"))
alpha.rare = matrix(0, ncol=ncol(alpha), nrow=nrow(alpha))
for(i in 1:20){
alpha.rare= alpha.rare +
estimate_richness(rarefy_even_depth(phy3, sample.size = 3500,
verbose=F, trimOTUs = F),
measures=c("Observed", "Chao1", "Shannon"))
}
alpha.rare = alpha.rare/20
plot(alpha$Observed, alpha.rare$Observed)
plot(alpha$Chao1, alpha.rare$Chao1)
plot(alpha$Shannon, alpha.rare$Shannon)
Summarize the data at the order level
order.phy = tax_glom(phy3, taxrank = "Order")
Normalize the data using CLR, relative abundance, and DESeq2
order.rel = transform_sample_counts(order.phy, function(x) x/sum(x))
order.clr = transform_sample_counts(order.phy, function(x){y=log(x+1); y/sum(y)})
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
##
## distance
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:phyloseq':
##
## sampleNames
countData = round(as(otu_table(order.phy), "matrix"), digits = 0)
countData = countData + 1L
dds <- DESeqDataSetFromMatrix(countData, sample_data(order.phy), design = ~1)
## Warning in class(from) <- NULL: Setting class(x) to NULL; result will no
## longer be an S4 object
## converting counts to integer mode
order.deseq = merge_phyloseq(otu_table(counts(estimateSizeFactors(dds),
normalized=T), taxa_are_rows = T),
sample_data(order.phy),
phy_tree(order.phy),
tax_table(order.phy))
Compute appropriate univariate tests on normalized data
Location = sample_data(order.rel)$Location
rel.p = apply(otu_table(order.rel),1, function(x) wilcox.test(c(x)~Location)$p.value)
## Warning in wilcox.test.default(x = structure(c(0.00614334470989761,
## 0.00181159420289855, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0.00068259385665529,
## 0.0072463768115942, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0.00068259385665529,
## 0.000452898550724638, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0,
## 0.000630119722747322, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0,
## 0.000630119722747322, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0.000452898550724638, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = structure(c(0, 0.000452898550724638, :
## cannot compute exact p-value with ties
clr.p = apply(otu_table(order.clr),1, function(x) t.test(c(x)~Location)$p.value)
deseq.p = apply(otu_table(order.deseq),1, function(x) t.test(log10(c(x))~Location)$p.value)
univ.location.res = data.frame(taxa = tax_table(order.clr)[,"Order"], rel.p, clr.p, deseq.p)
Adjust the p-values using False Discovery Rate
univ.location.res$rel.fdr = p.adjust(univ.location.res$rel.p, method="fdr")
univ.location.res$clr.fdr = p.adjust(univ.location.res$clr.p, method="fdr")
univ.location.res$deseq.fdr = p.adjust(univ.location.res$deseq.p, method="fdr")
colSums(univ.location.res<0.05)
## Warning in Ops.factor(left, right): '<' not meaningful for factors
## Order rel.p clr.p deseq.p rel.fdr clr.fdr deseq.fdr
## NA 10 8 9 9 7 7
univ.location.res
## Order rel.p clr.p deseq.p
## 1948 Bacteroidales 1.453797e-11 9.983697e-01 4.902762e-01
## 1931 "Erysipelotrichales" 8.437764e-01 2.076959e-01 5.187488e-01
## 4911 Anaeroplasmatales 9.721986e-10 2.249341e-15 5.735335e-14
## 4220 Clostridiales 1.186926e-19 1.894648e-32 3.651509e-18
## 1026 Bacillales 8.032710e-08 1.301173e-06 4.190743e-06
## 1673 "Lactobacillales" 6.834879e-16 4.321846e-19 1.071339e-16
## 4789 Actinobacteridae 1.132115e-01 7.601720e-02 4.509191e-02
## 5995 Burkholderiales 7.245135e-02 8.371138e-02 1.021904e-01
## 5975 Neisseriales 3.437768e-01 3.224313e-01 1.034844e-01
## 5242 Sphingomonadales 7.253024e-02 9.877532e-02 2.072876e-01
## 1453 Rhodobacterales 3.437768e-01 3.224313e-01 1.066468e-01
## 455 Rhizobiales 3.118829e-01 3.227785e-01 8.148514e-01
## 6648 Myxococcales 3.118829e-01 3.227785e-01 8.165448e-01
## 3405 Enterobacteriales 2.819788e-02 3.464769e-02 3.530636e-02
## 2012 Pasteurellales 9.633972e-01 9.332040e-01 6.386468e-01
## 1847 Pseudomonadales 1.239806e-03 4.114033e-03 4.024734e-03
## 2172 Chloroplast 8.788168e-05 3.013895e-04 2.380779e-04
## 3806 Flavobacteriales 1.461646e-01 1.886147e-01 3.350350e-01
## 4609 Mycoplasmatales 5.427940e-01 2.501740e-01 2.868498e-01
## 3348 Desulfovibrionales 1.954624e-02 9.799033e-01 2.337175e-01
## 2626 Coriobacteridae 1.557592e-11 1.459745e-12 3.481561e-12
## rel.fdr clr.fdr deseq.fdr
## 1948 8.177357e-11 9.983697e-01 6.052069e-01
## 1931 8.859653e-01 3.355087e-01 6.052069e-01
## 4911 4.083234e-09 1.574539e-14 4.014735e-13
## 4220 2.492544e-18 3.978760e-31 7.668168e-17
## 1026 2.811448e-07 5.464927e-06 1.760112e-05
## 1673 7.176623e-15 4.537938e-18 1.124905e-15
## 4789 1.828801e-01 1.757939e-01 1.052145e-01
## 5995 1.269279e-01 1.757939e-01 1.866319e-01
## 5975 4.010729e-01 3.765749e-01 1.866319e-01
## 5242 1.269279e-01 1.885711e-01 3.348492e-01
## 1453 4.010729e-01 3.765749e-01 1.866319e-01
## 455 4.010729e-01 3.765749e-01 8.165448e-01
## 6648 4.010729e-01 3.765749e-01 8.165448e-01
## 3405 5.921555e-02 9.095018e-02 9.267919e-02
## 2012 9.633972e-01 9.983697e-01 7.058727e-01
## 1847 3.254492e-03 1.234210e-02 1.207420e-02
## 2172 2.636450e-04 1.054863e-03 8.332725e-04
## 3806 2.192469e-01 3.300758e-01 4.397334e-01
## 4609 5.999302e-01 3.752611e-01 4.015897e-01
## 3348 4.560790e-02 9.983697e-01 3.505762e-01
## 2626 8.177357e-11 7.663660e-12 1.827820e-11
Plot the abundances using
plot_heatmap(),plot_bar(), or produce box and whisker plots
library(ggplot2)
plot_heatmap(order.rel, taxa.label = "Order", sample.order = "Location")+
facet_grid(.~Location, scales = "free_x")
plot_heatmap(order.clr, taxa.label = "Order", sample.order = "Location")+
facet_grid(.~Location, scales = "free_x")
plot_heatmap(order.deseq, taxa.label = "Order", sample.order = "Location")+
facet_grid(.~Location, scales = "free_x")
plot_bar(order.rel, fill="Location") +
facet_wrap(facets = "Order", ncol=7, nrow=3, scales = "free_y")
plot_bar(order.clr, fill="Location") +
facet_wrap(facets = "Order", ncol=7, nrow=3, scales = "free_y")
plot_bar(order.deseq, fill="Location") +
facet_wrap(facets = "Order", ncol=7, nrow=3, scales = "free_y")
ggplot(psmelt(order.rel), aes(x=Location, y=Abundance)) +
geom_boxplot(aes(fill=Location)) + geom_jitter() +
facet_wrap(facets="Order", ncol=7, nrow=3, scales="free_y")
ggplot(psmelt(order.clr), aes(x=Location, y=Abundance)) +
geom_boxplot(aes(fill=Location)) + geom_jitter() +
facet_wrap(facets="Order", ncol=7, nrow=3, scales="free_y")
ggplot(psmelt(order.deseq), aes(x=Location, y=Abundance)) +
geom_boxplot(aes(fill=Location)) + geom_jitter() +
facet_wrap(facets="Order", ncol=7, nrow=3, scales="free_y") + scale_y_log10()
Subset the order level data to only fecal samples
fecal.rel = subset_samples(order.rel, Location = "fecal")
Filter out taxa with low abundance (mean abundance < 0.1%)
fecal.rel.subs = subset_taxa(fecal.rel, rowMeans(otu_table(fecal.rel))> 0.001)
Perform univariate analysis of the fecal subset with respect to the Treatment variable
Treatment = sample_data(fecal.rel.subs)$Treatment
fecal.res = data.frame(taxa = tax_table(fecal.rel.subs)[,"Order"],
fecal.p = apply(otu_table(fecal.rel.subs),1,
function(x) kruskal.test(c(x)~Treatment)$p.value))
fecal.res$fecal.fdr = p.adjust(fecal.res$fecal.p, method="fdr")
ggplot(psmelt(fecal.rel.subs), aes(x=Treatment, y=Abundance)) +
geom_boxplot(aes(fill=Treatment)) + geom_jitter() +
facet_wrap(facets="Order", ncol=4, nrow=2, scales="free_y")
fecal.deseq = subset_samples(order.deseq, Location = "fecal")
pheno.res = data.frame(taxa = tax_table(fecal.deseq)[,"Order"],
BMD.p = apply(otu_table(fecal.deseq),1,
function(x) cor.test(log(c(x)), sample_data(fecal.deseq)$BMD)$p.value))
pheno.res$BMD.fdr = p.adjust(pheno.res$BMD.p, method="fdr")
pheno.res$pFAT.p = apply(otu_table(fecal.deseq),1,
function(x) cor.test(log(c(x)), sample_data(fecal.deseq)$pFat)$p.value)
pheno.res$pFAT.fdr = p.adjust(pheno.res$pFAT.p, method="fdr")
gFDR = with(pheno.res,
matrix(p.adjust(c(BMD.p, pFAT.p), method="fdr"),
ncol=2, nrow=nrow(pheno.res), byrow=F))
gFDR
## [,1] [,2]
## [1,] 0.8628829 0.8628829
## [2,] 0.8628829 0.3284212
## [3,] 0.8628829 0.8628829
## [4,] 0.8783478 0.8628829
## [5,] 0.8628829 0.8628829
## [6,] 0.9870451 0.9870451
## [7,] 0.8628829 0.9051185
## [8,] 0.9058371 0.8628829
## [9,] 0.8628829 0.8628829
## [10,] 0.8628829 0.8628829
## [11,] 0.8628829 0.8628829
## [12,] 0.8628829 0.8628829
## [13,] 0.8628829 0.8828143
## [14,] 0.8628829 0.8628829
## [15,] 0.8628829 0.8628829
## [16,] 0.8628829 0.8628829
## [17,] 0.9247915 0.3284212
## [18,] 0.8628829 0.9870451
## [19,] 0.8628829 0.9247915
## [20,] 0.8628829 0.9910441
## [21,] 0.9576660 0.9576660
ggplot(subset(psmelt(fecal.deseq), Order=="Enterobacteriales"), aes(x=BMD, y=Abundance)) +
geom_point() + scale_y_log10()
ggplot(subset(psmelt(fecal.deseq), Order=="Enterobacteriales"), aes(x=pFat, y=Abundance)) +
geom_point() + scale_y_log10()
ggplot(sample_data(fecal.deseq), aes(x=Treatment, y=BMD)) +
geom_boxplot(aes(fill=Treatment))+geom_jitter()
ggplot(sample_data(fecal.deseq), aes(x=Treatment, y=pFat)) +
geom_boxplot(aes(fill=Treatment))+geom_jitter()